df <- st_read('data/CA16_CTs_all.gpkg')
## Reading layer `CA16_CTs_all' from data source `/Users/sn/projects/connectivity-and-lfp/data/CA16_CTs_all.gpkg' using driver `GPKG'
## Simple feature collection with 5608 features and 47 fields
## geometry type:  MULTIPOLYGON
## dimension:      XY
## bbox:           xmin: -124.6992 ymin: 42.01779 xmax: -52.61941 ymax: 60.00006
## geographic CRS: WGS 84
df$percent_hh_with_children <- 100 * df$hh_with_children / df$hh_total
df$percent_drivers_female <- 100 * df$commute_driver_female / (df$commute_driver_male + df$commute_driver_female)
df$percent_publictransit_female <- 100 * df$commute_publictransit_female / (df$commute_publictransit_male + df$commute_publictransit_female)
df$lfp_gap <- df$lfp_male - df$lfp_female
df <- na.omit(df)
df_no_geom <- st_drop_geometry(df)

descriptives

iv_colnames <- c("pca1_stock", "med_hh_income_1000", "avg_rooms_per_dwelling", "percent_hh_with_children", "percent_drivers_female", "percent_publictransit_female")
lfp <- rbind(data.frame(lfp = df$lfp_female, gender ='Female'), data.frame(lfp = df$lfp_male, gender ='Male'))
ggplot(lfp, aes(x=lfp, fill=gender)) + geom_histogram(alpha=0.5, position="identity") + scale_x_continuous(breaks=seq(0,100,10)) + labs(x='LFP (%)') 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

avg_gap <- mean(df$lfp_gap)
ggplot(df_no_geom, aes(lfp_gap)) + geom_histogram(color="black", fill="white", binwidth = 2) + geom_vline(xintercept = avg_gap, color='red') + labs(x='percentage points')

avg_sndi <- mean(df$pca1_stock)
ggplot(df, aes(pca1_stock)) + geom_histogram(color="black", fill="white") + geom_vline(xintercept=avg_sndi, color='red') + labs(x='SNDI') 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

st_drop_geometry(df[iv_colnames]) %>% summarise_all(mean) %>% t()
##                                   [,1]
## pca1_stock                    2.226660
## med_hh_income_1000           78.232846
## avg_rooms_per_dwelling        6.168332
## percent_hh_with_children     40.896490
## percent_drivers_female       44.342552
## percent_publictransit_female 57.126427

determinants of LFP

df_vars <- df_no_geom[iv_colnames]
df_vars$lfp_female <- df_no_geom$lfp_female
model_all <- lm(lfp_female ~ pca1_stock + med_hh_income_1000 + avg_rooms_per_dwelling + percent_hh_with_children + percent_drivers_female + percent_publictransit_female , data=df_vars)
summary(model_all)
## 
## Call:
## lm(formula = lfp_female ~ pca1_stock + med_hh_income_1000 + avg_rooms_per_dwelling + 
##     percent_hh_with_children + percent_drivers_female + percent_publictransit_female, 
##     data = df_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.865  -4.210   0.369   4.732  41.482 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  46.169783   1.061722  43.486  < 2e-16 ***
## pca1_stock                   -0.492749   0.072938  -6.756 1.57e-11 ***
## med_hh_income_1000            0.176581   0.005692  31.022  < 2e-16 ***
## avg_rooms_per_dwelling       -2.767874   0.160250 -17.272  < 2e-16 ***
## percent_hh_with_children      0.071829   0.010555   6.805 1.12e-11 ***
## percent_drivers_female        0.364052   0.023530  15.472  < 2e-16 ***
## percent_publictransit_female  0.017646   0.007318   2.411   0.0159 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.348 on 5418 degrees of freedom
## Multiple R-squared:  0.2095, Adjusted R-squared:  0.2087 
## F-statistic: 239.4 on 6 and 5418 DF,  p-value: < 2.2e-16
plot(residuals(model_all), ylab='residuals')
abline(h=0, col='red', lw=2)

plot(residuals(model_all) ~ df_vars$lfp_female, ylab='residuals', xlab='fitted value')

qqnorm(residuals(model_all), main=NULL)
qqline(residuals(model_all))

bptest(model_all)  # heteroskedasticity
## 
##  studentized Breusch-Pagan test
## 
## data:  model_all
## BP = 687.66, df = 6, p-value < 2.2e-16
model_no_sndi <- lm(lfp_female ~ . -pca1_stock, data=df_vars)
anova(model_no_sndi, model_all)
## Analysis of Variance Table
## 
## Model 1: lfp_female ~ (pca1_stock + med_hh_income_1000 + avg_rooms_per_dwelling + 
##     percent_hh_with_children + percent_drivers_female + percent_publictransit_female) - 
##     pca1_stock
## Model 2: lfp_female ~ pca1_stock + med_hh_income_1000 + avg_rooms_per_dwelling + 
##     percent_hh_with_children + percent_drivers_female + percent_publictransit_female
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   5419 294965                                  
## 2   5418 292501  1    2463.9 45.639 1.571e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

multilevel model

df_vars$csd_uid <- df$csd_uid
model_csd <- lmer(lfp_female ~ pca1_stock + med_hh_income_1000 + avg_rooms_per_dwelling + percent_hh_with_children + percent_drivers_female + percent_publictransit_female + (1 | csd_uid), data=df_vars)
summary(model_csd)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: 
## lfp_female ~ pca1_stock + med_hh_income_1000 + avg_rooms_per_dwelling +  
##     percent_hh_with_children + percent_drivers_female + percent_publictransit_female +  
##     (1 | csd_uid)
##    Data: df_vars
## 
## REML criterion at convergence: 36633.9
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.9213 -0.5249  0.0599  0.5894  5.9513 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  csd_uid  (Intercept) 13.61    3.689   
##  Residual             46.70    6.833   
## Number of obs: 5425, groups:  csd_uid, 390
## 
## Fixed effects:
##                                Estimate Std. Error         df t value Pr(>|t|)
## (Intercept)                   4.809e+01  1.271e+00  5.034e+03  37.825  < 2e-16
## pca1_stock                   -6.062e-01  7.769e-02  5.055e+03  -7.802 7.33e-15
## med_hh_income_1000            1.822e-01  6.557e-03  4.740e+03  27.789  < 2e-16
## avg_rooms_per_dwelling       -3.054e+00  1.760e-01  5.196e+03 -17.359  < 2e-16
## percent_hh_with_children      1.055e-01  1.152e-02  5.346e+03   9.157  < 2e-16
## percent_drivers_female        3.430e-01  2.464e-02  5.405e+03  13.921  < 2e-16
## percent_publictransit_female  2.100e-02  7.270e-03  5.087e+03   2.889  0.00389
##                                 
## (Intercept)                  ***
## pca1_stock                   ***
## med_hh_income_1000           ***
## avg_rooms_per_dwelling       ***
## percent_hh_with_children     ***
## percent_drivers_female       ***
## percent_publictransit_female ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) pc1_st m___10 avg___ prc___ prcnt_d_
## pca1_stock   0.102                                     
## md_hh__1000  0.221  0.054                              
## avg_rms_pr_ -0.287 -0.230 -0.671                       
## prcnt_hh_w_ -0.037 -0.142 -0.043 -0.445                
## prcnt_drvr_ -0.772 -0.065 -0.064 -0.152  0.163         
## prcnt_pblc_ -0.295  0.022  0.120 -0.018 -0.093 -0.028
plot(residuals(model_csd), ylab='residuals')

plot(residuals(model_csd) ~ df_vars$lfp_female, ylab='residuals', xlab='fitted value')

qqnorm(residuals(model_csd))
qqline(residuals(model_csd))

LFP gender diff ~ .

df_vars$lfp_gap <- df$lfp_gap
model_gap <- lm(lfp_gap ~ . -lfp_female -csd_uid, data=df_vars)
summary(model_gap)
## 
## Call:
## lm(formula = lfp_gap ~ . - lfp_female - csd_uid, data = df_vars)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.333  -2.333  -0.028   2.428  17.017 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  22.042585   0.570525  38.636  < 2e-16 ***
## pca1_stock                    0.314628   0.039194   8.027 1.21e-15 ***
## med_hh_income_1000            0.017064   0.003059   5.579 2.54e-08 ***
## avg_rooms_per_dwelling       -0.282646   0.086112  -3.282  0.00104 ** 
## percent_hh_with_children      0.024156   0.005672   4.259 2.09e-05 ***
## percent_drivers_female       -0.305210   0.012644 -24.139  < 2e-16 ***
## percent_publictransit_female -0.026643   0.003933  -6.775 1.38e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.948 on 5418 degrees of freedom
## Multiple R-squared:  0.1459, Adjusted R-squared:  0.1449 
## F-statistic: 154.2 on 6 and 5418 DF,  p-value: < 2.2e-16
plot(residuals(model_gap), ylab='residuals')

plot(residuals(model_gap) ~ df_vars$lfp_female, ylab='residuals', xlab='fitted value')

qqnorm(residuals(model_gap), main='Residual QQ Plot (normal dist)')
qqline(residuals(model_gap))

Comparative by SNDI

df_no_geom %>% 
  group_by(csd_uid) %>% 
  summarise(sndi_csd = mean(pca1_stock), pop_csd = sum(population)) %>% 
  filter(pop_csd > 1000000) %>%
  arrange(sndi_csd)
## # A tibble: 3 x 3
##   csd_uid sndi_csd pop_csd
##   <chr>      <dbl>   <int>
## 1 2466023    0.464 1702338
## 2 3520005    1.51  2722663
## 3 4806016    2.53  1238541
low_sndi <- filter(df, csd_uid == '2466023')
high_sndi <- filter(df, csd_uid == '4806016')

Low SNDI

ggplot(low_sndi, aes(fill=lfp_female)) + geom_sf() + scale_fill_viridis_b() + labs(fill='LFP (%)') +  theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank(), rect = element_blank())

ggplot(low_sndi, aes(fill=pca1_stock)) + geom_sf() + scale_fill_viridis_b() + labs(fill='SNDI')+  theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank(), rect = element_blank())

low_sndi_reg <- st_drop_geometry(low_sndi[iv_colnames])
low_sndi_reg$lfp_female <- low_sndi$lfp_female
model_low <- lm(lfp_female ~ ., data=low_sndi_reg)
summary(model_low)
## 
## Call:
## lm(formula = lfp_female ~ ., data = low_sndi_reg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.609  -4.252   0.335   5.195  17.346 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  45.66045    3.96479  11.516  < 2e-16 ***
## pca1_stock                   -1.99298    0.36715  -5.428 9.24e-08 ***
## med_hh_income_1000            0.14596    0.03271   4.462 1.02e-05 ***
## avg_rooms_per_dwelling        1.34355    0.95630   1.405 0.160714    
## percent_hh_with_children     -0.33336    0.04929  -6.763 4.12e-11 ***
## percent_drivers_female        0.22963    0.06538   3.512 0.000489 ***
## percent_publictransit_female  0.08154    0.06121   1.332 0.183440    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.845 on 459 degrees of freedom
## Multiple R-squared:  0.3333, Adjusted R-squared:  0.3246 
## F-statistic: 38.24 on 6 and 459 DF,  p-value: < 2.2e-16
plot(residuals(model_low), ylab='residuals')

plot(residuals(model_low) ~ low_sndi_reg$lfp_female, ylab='residuals', xlab='fitted value')

qqnorm(residuals(model_low), main='Residual QQ Plot (normal dist)')
qqline(residuals(model_low))

High SNDI

ggplot(high_sndi, aes(fill=lfp_female)) + geom_sf() + scale_fill_viridis_b() + labs(fill='LFP (%)') +  theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank(), rect = element_blank())

ggplot(high_sndi, aes(fill=pca1_stock)) + geom_sf() + scale_fill_viridis_b() + labs(fill='SNDI') +  theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank(), rect = element_blank())

high_sndi_reg <- st_drop_geometry(high_sndi[iv_colnames])
high_sndi_reg$lfp_female <- high_sndi$lfp_female
model_high <- lm(lfp_female ~ ., data=high_sndi_reg)
summary(model_high)
## 
## Call:
## lm(formula = lfp_female ~ ., data = high_sndi_reg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.8531  -3.1539   0.0895   3.1920  12.5510 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  26.87653    5.97527   4.498 1.11e-05 ***
## pca1_stock                   -0.81836    0.32834  -2.492   0.0134 *  
## med_hh_income_1000            0.11516    0.01985   5.802 2.27e-08 ***
## avg_rooms_per_dwelling       -5.05750    0.51895  -9.746  < 2e-16 ***
## percent_hh_with_children      0.16178    0.03985   4.060 6.83e-05 ***
## percent_drivers_female        0.92715    0.10393   8.921  < 2e-16 ***
## percent_publictransit_female  0.29001    0.05487   5.285 3.02e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.988 on 220 degrees of freedom
## Multiple R-squared:  0.4963, Adjusted R-squared:  0.4825 
## F-statistic: 36.12 on 6 and 220 DF,  p-value: < 2.2e-16
plot(residuals(model_high), ylab='residuals')

plot(residuals(model_high) ~ high_sndi_reg$lfp_female, ylab='residuals', xlab='fitted value')

qqnorm(residuals(model_high), main='Residual QQ Plot (normal dist)')
qqline(residuals(model_high))

grouped by SNDI?

df_sndi <- data.frame(sndi=df_no_geom$pca1_stock, sndi_level=if_else(df_no_geom$pca1_stock<2.52, '>= avg', '< avg'), lfp=df_no_geom$lfp_female)
ggplot(df_sndi, aes(x=sndi, y=lfp)) + geom_point(alpha=0.2) + geom_smooth(method='lm', se=FALSE, aes(color = sndi_level)) + labs(y='Female LFP (%)', x='SNDI', color='SNDI') +theme_bw() 
## `geom_smooth()` using formula 'y ~ x'